# string manipulation
library(stringr)
# data wrangling
library(dplyr)
library(tidyr)
library(purrr)
# plot
library(ggplot2)
# write/read text files
library(readr)
If some of these libraries are not installed, install them first and then load them. For example:
# Example: install the missing packages once, then load them.
pkgs <- c("dplyr", "tidyr")
install.packages(pkgs)
library(dplyr)
library(tidyr)
Specify the directory containing raw files:
# Directory holding the raw WoS query result files, relative to this script.
files_dir <- "../data/raw/Wos advanced search query results_verwerking/"
Retrieve all input text files:
# List the input directory and keep only files whose name ends in ".txt".
files_all <- dir(files_dir)
# Anchored pattern: "\\.txt$" matches real .txt files only, whereas the bare
# "txt" pattern would also match any filename merely containing "txt".
# A logical mask subsets directly; the which() wrapper was redundant.
files <- files_all[str_detect(files_all, pattern = "\\.txt$")]
files
## [1] "WoS_search_query_data_2011_WE01.txt"
## [2] "WoS_search_query_data_2011_WE02.txt"
## [3] "WoS_search_query_data_2011_WE04.txt"
## [4] "WoS_search_query_data_2011_WE05.txt"
## [5] "WoS_search_query_data_2011_WE06.txt"
## [6] "WoS_search_query_data_2011_WE07.txt"
## [7] "WoS_search_query_data_2011_WE08.txt"
## [8] "WoS_search_query_data_2011_WE09.txt"
## [9] "WoS_search_query_data_2011_WE10.txt"
## [10] "WoS_search_query_data_2011_WE11.txt"
## [11] "WoS_search_query_data_2011_WE12.txt"
## [12] "WoS_search_query_data_2011_WE13.txt"
## [13] "WoS_search_query_data_2011_WE14.txt"
## [14] "WoS_search_query_data_2011_WE15.txt"
## [15] "WoS_search_query_data_2012_WE01.txt"
## [16] "WoS_search_query_data_2012_WE02.txt"
## [17] "WoS_search_query_data_2012_WE04.txt"
## [18] "WoS_search_query_data_2012_WE05.txt"
## [19] "WoS_search_query_data_2012_WE06.txt"
## [20] "WoS_search_query_data_2012_WE07.txt"
## [21] "WoS_search_query_data_2012_WE08.txt"
## [22] "WoS_search_query_data_2012_WE09.txt"
## [23] "WoS_search_query_data_2012_WE10.txt"
## [24] "WoS_search_query_data_2012_WE11.txt"
## [25] "WoS_search_query_data_2012_WE12.txt"
## [26] "WoS_search_query_data_2012_WE13.txt"
## [27] "WoS_search_query_data_2012_WE14.txt"
## [28] "WoS_search_query_data_2012_WE15.txt"
## [29] "WoS_search_query_data_2013_WE01.txt"
## [30] "WoS_search_query_data_2013_WE02.txt"
## [31] "WoS_search_query_data_2013_WE04.txt"
## [32] "WoS_search_query_data_2013_WE05.txt"
## [33] "WoS_search_query_data_2013_WE06.txt"
## [34] "WoS_search_query_data_2013_WE07.txt"
## [35] "WoS_search_query_data_2013_WE08.txt"
## [36] "WoS_search_query_data_2013_WE09.txt"
## [37] "WoS_search_query_data_2013_WE10.txt"
## [38] "WoS_search_query_data_2013_WE11.txt"
## [39] "WoS_search_query_data_2013_WE12.txt"
## [40] "WoS_search_query_data_2013_WE13.txt"
## [41] "WoS_search_query_data_2013_WE14.txt"
## [42] "WoS_search_query_data_2013_WE15.txt"
## [43] "WoS_search_query_data_2014_WE01.txt"
## [44] "WoS_search_query_data_2014_WE02.txt"
## [45] "WoS_search_query_data_2014_WE04.txt"
## [46] "WoS_search_query_data_2014_WE05.txt"
## [47] "WoS_search_query_data_2014_WE06.txt"
## [48] "WoS_search_query_data_2014_WE07.txt"
## [49] "WoS_search_query_data_2014_WE08.txt"
## [50] "WoS_search_query_data_2014_WE09.txt"
## [51] "WoS_search_query_data_2014_WE10.txt"
## [52] "WoS_search_query_data_2014_WE11.txt"
## [53] "WoS_search_query_data_2014_WE12.txt"
## [54] "WoS_search_query_data_2014_WE13.txt"
## [55] "WoS_search_query_data_2014_WE14.txt"
## [56] "WoS_search_query_data_2014_WE15.txt"
## [57] "WoS_search_query_data_2015_WE01.txt"
## [58] "WoS_search_query_data_2015_WE02.txt"
## [59] "WoS_search_query_data_2015_WE04.txt"
## [60] "WoS_search_query_data_2015_WE05.txt"
## [61] "WoS_search_query_data_2015_WE06.txt"
## [62] "WoS_search_query_data_2015_WE07.txt"
## [63] "WoS_search_query_data_2015_WE08.txt"
## [64] "WoS_search_query_data_2015_WE09.txt"
## [65] "WoS_search_query_data_2015_WE10.txt"
## [66] "WoS_search_query_data_2015_WE11.txt"
## [67] "WoS_search_query_data_2015_WE12.txt"
## [68] "WoS_search_query_data_2015_WE13.txt"
## [69] "WoS_search_query_data_2015_WE14.txt"
## [70] "WoS_search_query_data_2015_WE15.txt"
## [71] "WoS_search_query_data_2016_WE01.txt"
## [72] "WoS_search_query_data_2016_WE02.txt"
## [73] "WoS_search_query_data_2016_WE04.txt"
## [74] "WoS_search_query_data_2016_WE05.txt"
## [75] "WoS_search_query_data_2016_WE06.txt"
## [76] "WoS_search_query_data_2016_WE07.txt"
## [77] "WoS_search_query_data_2016_WE08.txt"
## [78] "WoS_search_query_data_2016_WE09.txt"
## [79] "WoS_search_query_data_2016_WE10.txt"
## [80] "WoS_search_query_data_2016_WE11.txt"
## [81] "WoS_search_query_data_2016_WE12.txt"
## [82] "WoS_search_query_data_2016_WE13.txt"
## [83] "WoS_search_query_data_2016_WE14.txt"
## [84] "WoS_search_query_data_2016_WE15.txt"
Import the text files containing the raw output of WOS queries:
# Read every query-result file into a data frame, skipping the first 3 lines
# (file header).  The files carry no column names; X2 is forced to character
# because it mixes publication years and journal titles (cleaned up below).
raw_WOS_output <- map(files, function(f)
  read_csv(str_c(files_dir, f, sep = "/"),
           skip = 3, col_names = FALSE,
           col_types = cols(X2 = col_character())))
The analysis will benefit from tidying our data. In tidy data, each variable is a column and each observation is a row.
In our case, year and department are two important variables. Extract the year and the department identifier (WExx) from each filename:
# Extract the department code ("WExx") and the 4-digit year directly with
# regexes instead of relying on the position of "_"-separated fields, which
# breaks silently if the filename template ever changes.
# str_extract() is vectorized, so no map_chr() is needed.
deps <- str_extract(files, "WE\\d{2}")
# The first 4-digit run in the filename is the year (e.g. "..._2011_WE01.txt").
years <- str_extract(files, "\\d{4}")
Add department and year to each data frame:
# Tag each data frame with the department and year parsed from its filename.
raw_WOS_df <- map2(raw_WOS_output, deps, ~ mutate(.x, dep = .y))
raw_WOS_df <- map2(raw_WOS_df, years, ~ mutate(.x, year = .y))
Some rows do not contain any relevant information:
# Preview: rows where both X2 and X3 are missing carry no information.
raw_WOS_df[[1]] %>%
  filter(is.na(X2), is.na(X3))
We remove them from all data frames:
# Drop rows that have neither a year (X2) nor a journal (X3).
raw_WOS_df <- map(raw_WOS_df,
                  ~ filter(.x, !(is.na(X2) & is.na(X3))))
Some data frames contain rows with journal titles in second column instead of third one:
# Preview: non-missing X2 values that do not parse as numbers are journal
# titles that landed in the wrong column (the coercion warning is expected).
raw_WOS_df[[1]] %>%
  filter(!is.na(X2), is.na(as.numeric(X2)))
In these cases we should copy the content of the second column to the third one. We remove these titles from second column, as it should contain year of publication instead. We also give appropriate names to the columns. We call this list of data frames clean_WOS_dfs:
# Some files put the journal title in X2 instead of X3.  Fill missing X3
# values from X2: coalesce() takes the first non-missing value, which is
# exactly what the case_when(is.na(X3) ~ X2, TRUE ~ X3) expressed, but is
# the idiomatic dplyr tool for NA-filling.
clean_WOS_dfs <- map(raw_WOS_df,
                     function(x) mutate(x, X3 = coalesce(X3, X2)))
# Give the columns meaningful names and parse the publication year.
# X2 entries that still hold a journal title become NA under as.numeric()
# (the coercion warning is expected); the X* columns are then dropped.
clean_WOS_dfs <- map(clean_WOS_dfs, function(x)
  x %>%
    mutate(publication_year = as.numeric(X2),
           journal = X3,
           author = X1) %>%
    select(-starts_with("X")))
We can now merge all the data frames together, thus creating a complete tidy data frame:
# Stack all cleaned per-file data frames into a single tidy data frame.
tidy_WOS_df <- bind_rows(clean_WOS_dfs)
Some journal names are in lowercase, which means they are not A1 journals. Some examples:
# Preview journals whose name contains lowercase letters (non-A1 journals).
tidy_WOS_df %>%
  filter(journal != toupper(journal)) %>%
  head(n = 10)
We remove them:
# Keep only all-uppercase journal names (the A1 journals).
tidy_WOS_df <- filter(tidy_WOS_df, journal == toupper(journal))
Some preview (randomly picked up) of the tidy data frame:
# Quick look at 20 randomly chosen rows, ordered by department.
tidy_WOS_df %>%
  sample_n(size = 20) %>%
  arrange(dep)
Save the tidy data frame, as it is the base of any further analysis:
# Persist the tidy data frame for all downstream analyses.
# `file` replaces the deprecated `path` argument of readr::write_tsv()
# (deprecated since readr 1.4).
write_tsv(tidy_WOS_df,
          file = "../data/processed/tidy_WOS_df.txt",
          na = "")
First, we calculate the total number of cited journals per department per year:
# Total number of distinct cited journals per department per year.
# n_distinct() counts each journal once, replacing the longer
# distinct() + count() + rename() chain; ungroup() prevents the residual
# grouping from leaking into later operations.
total_n_journals <- tidy_WOS_df %>%
  group_by(dep, year) %>%
  summarize(tot_n_journals = n_distinct(journal)) %>%
  ungroup()
total_n_journals
See graphs below:
More details about changes in the number of cited journals for each department:
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
Maximum and minimum:
## [1] "Maximum: 3660 (dep WE11, year 2015)"
## [1] "Minimum: 107 (dep WE15, year 2016)"
Some statistics for each department:
# Per-department summary of the yearly journal counts.
# NOTE: as.integer() truncates toward zero, and perc_variability is computed
# from the already-truncated mean and sd, so the order of these statements
# matters — reordering would change the reported percentages.
stats_cited_journals <- total_n_journals %>%
group_by(dep) %>%
summarize(
mean_journals = as.integer(mean(tot_n_journals)),
st_dev_journals = as.integer(sd(tot_n_journals)),
perc_variability = as.integer(st_dev_journals/mean_journals*100)) %>%
arrange(desc(mean_journals))
stats_cited_journals
We first show how many journals have been cited more than x times by each department and year, with x between 1 and 10:
# Candidate thresholds: "cited more than x times" for x = 1..10.
limit <- 1:10
# Citation count per journal within each department/year, most cited first.
# count(dep, year, journal) is shorthand for group_by() + count() + ungroup().
tot_n_journals <- tidy_WOS_df %>%
  count(dep, year, journal) %>%
  arrange(dep, year, desc(n))
# For each threshold, label every journal as cited more than that many
# times ("+") or at most that many ("- or ="), remembering the threshold,
# then stack the per-threshold results into a single data frame.
more_less_limit <- bind_rows(
  map(limit, function(threshold)
    tot_n_journals %>%
      mutate(more_or_less = if_else(n > threshold, "+", "- or ="),
             limit = threshold)))
# Share of journals on each side of every threshold, per department/year.
# count(..., name =) replaces group_by() + summarize(n()) + ungroup().
# The tot_n_journals column joined in here comes from the total_n_journals
# data frame (total distinct journals), not the tot_n_journals object above.
stat_more_less_limit <- more_less_limit %>%
  count(dep, year, limit, more_or_less, name = "n_journals") %>%
  left_join(total_n_journals, by = c("dep", "year")) %>%
  mutate(perc_n_journals = round(n_journals / tot_n_journals * 100))
Stacked histogram:
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
With a fixed threshold, the proportion of journals cited more than x times by each department is never the same. Alternatively, we can fix the percentage of cited journals and calculate percentile thresholds from the total number of journals cited by each department. We use percentiles from 5% to 30% in steps of 5%:
# Rank journals within each department/year by citation count (rank 1 =
# most cited; ties share the lowest rank — min_rank() is equivalent to
# rank(desc(n), ties.method = "min") since n is never NA), then express
# the rank as a percentage of the department/year's total journal count.
rank_journals <- tot_n_journals %>%
  group_by(dep, year) %>%
  mutate(rank = min_rank(desc(n))) %>%
  ungroup() %>%
  left_join(total_n_journals, by = c("dep", "year")) %>%
  mutate(perc_rank = rank / tot_n_journals * 100)
# Percentile thresholds 5%, 10%, ..., 30%; the names label the list/plot
# output produced from this vector below.
limit_perc <- seq(5, 30, 5)
names(limit_perc) <- paste("perc_rank", limit_perc, sep = "_")
# For each percentile threshold, count the journals ranked inside it per
# department/year and collapse their names (already in citation order,
# because rank_journals preserves the desc(n) arrangement) into one string.
n_journals_less <- limit_perc %>%
  map(function(p)
    rank_journals %>%
      filter(perc_rank < p) %>%
      group_by(dep, year) %>%
      summarize(percentile = p,
                n_journals_less_perc = n(),
                journals = paste(journal, collapse = ","))) %>%
  bind_rows() %>%
  ungroup()
n_journals_less
Plot per department and year:
# One bar chart per percentile threshold: journals within the threshold,
# by department, faceted by year.
map(limit_perc, function(p) {
  n_journals_less %>%
    filter(percentile == p) %>%
    ggplot(aes(x = dep, y = n_journals_less_perc)) +
    geom_col() +
    facet_wrap(~ year) +
    theme(strip.text = element_text(size = 12),
          axis.text.x = element_text(angle = 90, hjust = 1, size = 8))
})
## $perc_rank_5
##
## $perc_rank_10
##
## $perc_rank_15
##
## $perc_rank_20
##
## $perc_rank_25
##
## $perc_rank_30
The column journals contains the journals which are sufficiently cited to be included in this percentile-based threshold. They are also ordered by number of citations (rank). We save this data frame:
# Save the percentile-threshold results.
# `file` replaces the deprecated `path` argument of readr::write_tsv()
# (deprecated since readr 1.4).
write_tsv(n_journals_less,
          file = "../data/processed/percentile_threshold_journals.txt",
          na = "")